1 tidy data

UHF_zipcode = 
  read_csv("./data/appb.csv") %>% 
  slice(-43) %>% 
  select(-Borough) %>% 
  rename("UHF" = "UHF Neighborhood") %>% 
  janitor::clean_names()
raw_hiv = 
  GET("https://data.cityofnewyork.us/api/views/fju2-rdad/rows.csv") %>% 
  content("parsed") %>% 
  janitor::clean_names()

combine_hiv = 
  right_join(UHF_zipcode, raw_hiv, by = "uhf") %>%
  janitor::clean_names() %>% 
  separate(zip_code, into = c("zipcode1", "zipcode2", "zipcode3", 
                              "zipcode4", "zipcode5", "zipcode6", "zipcode7", "zipcode8",
                              "zipcode9"), sep = ", ") %>% 
  gather(key = zip_code, value = zipcode_value, zipcode1:zipcode9) %>% 
  filter(!is.na(zipcode_value)) %>% 
  rename("zipcode" = "zipcode_value") %>% 
  select(zipcode, everything(), -zip_code)
r = GET('http://data.beta.nyc//dataset/3bf5fb73-edb5-4b05-bb29-7c95f4a727fc/resource/6df127b1-6d04-4bb7-b983-07402a2c3f90/download/f4129d9aa6dd4281bc98d0f701629b76nyczipcodetabulationareas.geojson')
nyc_zipcode = readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)

zipcode_lat_lng = nyc_zipcode@data %>% 
  select(zipcode = postalCode, longitude, latitude) %>% 
  mutate(zipcode = as.character(zipcode))
  
combine_hiv1 = 
  full_join(zipcode_lat_lng, combine_hiv, by = "zipcode") %>% 
  mutate(longitude = as.numeric(longitude), latitude = as.numeric(latitude)) %>% 
  group_by(uhf) %>% 
  summarise(lng = mean(longitude),
            lat = mean(latitude)) %>% 
  filter(!(uhf == "Pelhem - Throgs Neck"))
pums_raw = read_csv("./data/selected_pums.csv")

temp = tempfile(fileext = ".xls")

dataURL = "http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/nyc_zcta10_to_puma10.xls"

download.file(dataURL, destfile = temp, mode = 'wb')

zcta_to_puma = readxl::read_xls(temp, sheet = 2) %>% 
  select(zcta = zcta10, puma = puma10) %>% 
  mutate(puma = as.numeric(puma))

dataURL = "http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/zip_to_zcta10_nyc_revised.xls"

download.file(dataURL, destfile = temp, mode = 'wb')
zip_to_zcta = readxl::read_xls(temp, sheet = 2) %>% 
  select(zipcode, zcta = zcta5) 
pums_data = 
  pums_raw %>% 
  select(puma = PUMA10, income = PINCP, year = ADJINC) %>% 
  filter(puma != -9)  # remove data from 2011 due to lack of area information

pums_data$year = recode(pums_data$year, 
                        "1042852" = "2012",
                        "1025215" = "2013",  
                        "1009585" = "2014", 
                        "1001264" = "2015")   

pums_data = 
  pums_data %>% 
  group_by(year, puma) %>% 
  summarise(mid_income = median(income, na.rm = TRUE)) %>% 
  ungroup()         # calculate yearly median income for each area
puma_to_zipcode = right_join(zip_to_zcta, zcta_to_puma, by = "zcta") %>%   # generaate a puma to zipcode file
  select(puma, zipcode)

income_zipcode = right_join(pums_data, puma_to_zipcode, by = "puma") %>%  # matching zipcode with median income data
  select(year, zipcode, mid_income) %>% 
  mutate(year = as.numeric(year))

combine_hiv_income = 
  left_join(combine_hiv, income_zipcode, by = c("year", "zipcode"))

2 visualization

First, import and tidy data:

gender neighborhood VS hiv

neb_plot = hiv_data %>% 
  group_by(uhf, gender) %>% 
  filter(year != "ALL", borough != "All", uhf != "All", gender != "All") %>% 
  filter(age != "All") %>%
  summarise(sum_hiv = sum(hiv_diagnoses)) %>% 
  ggplot(aes(x = reorder(uhf, sum_hiv), y = sum_hiv, color = gender)) + 
  coord_flip() +
  geom_point() +
  labs(
        title = "Gender and Neighborhood Influence on HIV Incidence",
        x = "Neighborhood",
        y = "HIV diagnoses",
        caption = "Data from the ..."
      )

ggplotly(neb_plot)

Interpretation: The number of HIV diagnoses is apperently higher among male subgroups than female in all neighborhoods. Beford Stuyvesant - Crown Heights have the highest total HIV diagnoses and highest female HIV diagnoses cases. Chelsea - Clinton ranks first in male HIV diagnoses. Bayside - Little Neck has lowest number of HIV diagnoses for both male and female.

Gender, age vs hiv

  • Omit the data of transgender group, because the data of transgender were not divided into specific age groups.
age_plot = hiv_data %>% 
  filter(race == "All" & borough == "All" & age != "All") %>% 
  group_by(gender, age) %>% 
  summarise(sum_hiv = sum(hiv_diagnoses)) %>% 
  ggplot(aes(y = sum_hiv, x = age, fill = gender)) + 
  geom_bar(stat = "identity", alpha = 0.8, position = position_dodge()) +
  scale_fill_brewer(palette = "Dark2") +
  labs(
        title = "Gender and Age Influence on HIV Incidence",
        x = "Age range",
        y = "HIV diagnoses",
        caption = "Data from the ..."
      ) 

ggplotly(age_plot)

Interpretation: As is shown in the barchart above, in every age range, HIV incidence rate in male is significantly higher than that in female population. The potential explaination could be the gender differences with respect to HIV/AIDS depend on patterns of disease transmission. Most infections occurred in adults aged 20 to 29 years, and the incidence porpotion declines as the increase of age.

Gender race vs hiv

race_plot = hiv_data %>% 
  filter(age == "All" & borough == "All" & race != "All") %>% 
  group_by(gender, race) %>% 
  summarise(sum_hiv = sum(hiv_diagnoses)) %>% 
  ggplot(aes(y = sum_hiv, x = reorder(race, sum_hiv), fill = gender)) + 
  geom_bar(stat = "identity", alpha = 0.8, position = position_dodge()) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  labs(
        title = "Race and Gender Influence on HIV Incidence",
        x = "Race",
        y = "HIV diagnoses",
        caption = "Data from the ..."
      ) 

ggplotly(race_plot)

Interpretation: By race/ethnicity, black men/women have the highest rates of new HIV infections among all men/women. Whereas the incidence rate among Asian/Pacific Islander is the lowest given the study population in NYC. This is because some race population groups have higher rates of HIV in their communities, thus raising the risk of new infections with each sexual or drug use encounter. Plus, social, economic, and demographic factors of various race group—such as stigma, discrimination, income, education, and geographic region—could also affect their risk for HIV.

hiv diagnoses in borough with most hiv over years

hiv_data %>%
  filter(borough != "All", uhf == "All", gender == "All", age == "All", race == "All") %>% 
  group_by(borough) %>% 
  summarize(sum_hiv = sum(hiv_diagnoses)) %>% 
  arrange(desc(sum_hiv))
## # A tibble: 5 x 2
##   borough       sum_hiv
##   <chr>           <int>
## 1 Brooklyn         3815
## 2 Manhattan        3536
## 3 Bronx            2736
## 4 Queens           2327
## 5 Staten Island     217
year_plot = hiv_data %>% 
  mutate(year = as.integer(year)) %>% 
  filter(borough == "Brooklyn" & gender == "Male" & age == "20 - 29") %>% 
  group_by(year, uhf) %>% 
  summarize(sum_hiv = sum(hiv_diagnoses)) %>% 
  ggplot(aes(x = year, y = sum_hiv, color = uhf)) +
  geom_line() +
  labs(
        title = "Race and Gender Influence on HIV Incidence",
        x = "Year",
        y = "Sum of HIV diagnoses",
        caption = "Data from the ..."
      ) 
  
ggplotly(year_plot)
overall_year = hiv_data %>%
  filter(borough == "All", uhf == "All", age == "All", race == "All") %>% 
  group_by(year, gender) %>% 
  summarize(sum_hiv = sum(hiv_diagnoses)) %>% 
  ggplot(aes(x = year, y = sum_hiv, group = gender, color = gender)) +
  geom_line() +
  labs(
        title = "Yearly change trend on HIV Incidence",
        x = "Year",
        y = "Sum of HIV diagnoses",
        caption = "Data from the ..."
      ) 
ggplotly(overall_year)

Interpretation: The overall trend of total HIV incidence among study population in NYC on the decline from 2011 to 2015. The decrease in male is much significant than in female, and not very obvious in transgender group for theie low base account. The downward trend in HIV incidence rate reflects the improvment of public health practice affect the HIV incidence rate through repeated exposure to counseling (such as the promotion of condom use or safe sex or other prevention messages) and the advances in HIV treatments.

Income vs Hiv

income_plot = combine_hiv_income %>% 
  filter(year != 2011, gender == "All", age == "All", race == "All") %>% 
  group_by(uhf) %>%
  summarise(sum_hiv = sum(hiv_diagnoses), mean_income = mean(mid_income)) %>% 
  ggplot(aes(x = mean_income, y = sum_hiv)) + 
  geom_point() +
  labs(
        title = "Income Influence on HIV Incidence",
        x = "Average income of each neighborhood",
        y = "HIV diagnoses",
        caption = "Data from the ..."
      )

ggplotly(income_plot)

Interpretation: According to the scatterplot of income vs HIV diagnoses, it is obvious that the points are mostly concentrated in low income neigbourhood and the number of HIV diagnoses in high average income area( > 60000/year) centered in less than 1000 cases/neigbourhood. This result is exactly what we expected, because low-income community are tend to have insufficient healthcare supply, less insurance coverage, poor education level and inadequate epidemiology awareness, which could jointly cause the relatively high HIV incidence. So, we intend to advocate the related authorities to spare more public health resource in low-to-median-income neigbourhood to raise public awareness of HIV prevention knowledge and increase budgets of medical facilities in those areas.

Limitations:

We can not visualize the effect of age and race at the same time.

3 regression

Hiv diagnoses

income_hiv %>% 
  filter(year != "2011" & age != "All") %>%
  lm(hiv_diagnoses ~ borough + gender + age + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 0.983 0.302 3.252 0.001
boroughBrooklyn 0.297 0.281 1.060 0.289
boroughManhattan 3.091 0.331 9.332 0.000
boroughQueens -1.245 0.259 -4.811 0.000
boroughStaten Island -4.376 0.397 -11.016 0.000
genderMale 6.083 0.152 40.138 0.000
age20 - 29 9.600 0.262 36.576 0.000
age30 - 39 6.870 0.262 26.175 0.000
age40 - 49 4.627 0.262 17.627 0.000
age50 - 59 2.355 0.262 8.972 0.000
age60+ 0.427 0.262 1.626 0.104
mid_income 0.000 0.000 -17.851 0.000
income_hiv %>% 
  filter(year != "2011" & race != "All") %>%
  lm(hiv_diagnoses ~ borough + gender + race + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 1.514 0.514 2.945 0.003
boroughBrooklyn 0.357 0.493 0.724 0.469
boroughManhattan 3.710 0.582 6.376 0.000
boroughQueens -1.494 0.454 -3.287 0.001
boroughStaten Island -5.251 0.698 -7.527 0.000
genderMale 7.299 0.266 27.425 0.000
raceBlack 10.932 0.421 25.978 0.000
raceLatino/Hispanic 9.027 0.421 21.451 0.000
raceOther/Unknown -1.380 0.421 -3.278 0.001
raceWhite 3.628 0.421 8.621 0.000
mid_income 0.000 0.000 -12.197 0.000
income_plot = income_hiv %>% 
  filter(year != "2011") %>% 
  group_by(uhf, year) %>% 
  summarise(sum_hiv = mean(hiv_diagnoses), mid_in = median(mid_income)) %>% 
  ggplot(aes(x = mid_in, y = sum_hiv, color = year)) +
  geom_point() + 
  geom_smooth(method = lm) +
  theme_bw() +
  theme(legend.position = "None")
ggplotly(income_plot)

Income distribution in different neighborhood

income_dist = income_hiv %>% 
  ggplot(aes(y = mid_income, x = uhf)) +
  geom_point(alpha = 0.1) +
  coord_flip() +
  theme_bw()
ggplotly(income_dist)         

HIV diagnosis rate

income_hiv %>% 
  filter(year != "2011" & age != "All") %>%
  lm(hiv_diagnosis_rate ~ borough + gender + age + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 17.851 1.511 11.811 0.000
boroughBrooklyn -12.078 1.403 -8.611 0.000
boroughManhattan 15.424 1.656 9.316 0.000
boroughQueens -22.620 1.293 -17.492 0.000
boroughStaten Island -30.524 1.985 -15.377 0.000
genderMale 39.822 0.757 52.582 0.000
age20 - 29 44.060 1.312 33.589 0.000
age30 - 39 33.215 1.312 25.322 0.000
age40 - 49 27.986 1.312 21.336 0.000
age50 - 59 13.841 1.312 10.552 0.000
age60+ -4.261 1.312 -3.249 0.001
mid_income -0.001 0.000 -18.326 0.000
income_hiv %>% 
  filter(year != "2011" & race != "All") %>%
  lm(hiv_diagnosis_rate ~ borough + gender + race + mid_income, data = .) %>% 
  summary() %>% 
  broom::tidy() %>% 
  knitr::kable(digits = 3)
term estimate std.error statistic p.value
(Intercept) 0.795 2.459 0.323 0.747
boroughBrooklyn -11.951 2.357 -5.070 0.000
boroughManhattan 22.135 2.783 7.955 0.000
boroughQueens -25.218 2.173 -11.603 0.000
boroughStaten Island -31.251 3.336 -9.367 0.000
genderMale 49.480 1.273 38.875 0.000
raceBlack 61.810 2.012 30.713 0.000
raceLatino/Hispanic 34.404 2.012 17.095 0.000
raceOther/Unknown 9.809 2.012 4.874 0.000
raceWhite 12.984 2.012 6.452 0.000
mid_income 0.000 0.000 -1.729 0.084
income_plot_diag_rate = income_hiv %>% 
  filter(year != "2011") %>% 
  group_by(uhf, year) %>% 
  summarise(sum_hiv_diagnosis_rate = sum(hiv_diagnosis_rate), mid_in = median(mid_income)) %>% 
  ggplot(aes(x = mid_in, y = sum_hiv_diagnosis_rate, color = year)) +
  geom_point() + 
  geom_smooth(method = lm) +
  theme_bw() +
  theme(legend.position = "None")
ggplotly(income_plot_diag_rate)

4 map plot

map_data diagnoses

diagnoses_by_zipcode =  
  combine_hiv_income %>%
  filter(gender == "All", age == "All", race == "All") %>% 
  mutate(zipcode = factor(zipcode)) %>% 
  group_by(zipcode, uhf) %>% 
  summarise(sum_diagnoses = sum(hiv_diagnoses), sum_diagnosis_rate = sum(hiv_diagnosis_rate))
  
map_data = geo_join(nyc_zipcode, diagnoses_by_zipcode, "postalCode", "zipcode")

point

points_spdf = combine_hiv1 %>% 
  filter(!is.na(lng))
  
coordinates(points_spdf) = ~lng + lat
proj4string(points_spdf) = proj4string(nyc_zipcode)
matches = over(points_spdf, nyc_zipcode)

points = full_join(matches, rename(diagnoses_by_zipcode, postalCode = zipcode), by = "postalCode")
## Warning: Column `postalCode` joining factors with different levels,
## coercing to character vector

map1 diagnoses

pal1 = colorNumeric(palette = "Reds", domain = range(map_data@data$sum_diagnoses, na.rm = T))
# "BuPu" "viridis" "Greens""inferno"

leaflet(map_data) %>%
  addTiles() %>%  
  addPolygons(color = "black", weight = 1,
              fillColor = ~pal1(sum_diagnoses), fillOpacity = 0.8, 
              popup = ~stringr::str_c(uhf, "  sum:", factor(sum_diagnoses))) %>% 
  addMarkers(~longitude, ~latitude,
             popup = ~stringr::str_c(uhf, "  sum:", factor(sum_diagnoses)), data = points) %>%  
  addProviderTiles("CartoDB.Positron") %>%
  addLegend(position = "bottomright", pal = pal1, values = ~sum_diagnoses,
            title = "The amount of HIV diagnoses", opacity = 0.8) %>% 
  setView(-73.98, 40.75, zoom = 11)
## Warning in validateCoords(lng, lat, funcName): Data contains 127 rows with
## either missing or invalid lat/lon values and will be ignored

map2 diagnoses_rate

pal2 = colorNumeric(palette = "Reds", domain = range(map_data@data$sum_diagnosis_rate, na.rm = T))
# "BuPu" "viridis" "Greens""inferno"

leaflet(map_data) %>%
  addTiles() %>%  
  addPolygons(color = "black", weight = 1,
              fillColor = ~pal1(sum_diagnosis_rate), fillOpacity = 0.8, 
              popup = ~stringr::str_c(uhf, "  sum:", factor(sum_diagnosis_rate))) %>% 
  addMarkers(~longitude, ~latitude,
             popup = ~stringr::str_c(uhf, "  sum:", factor(sum_diagnosis_rate)), data = points) %>%  
  addProviderTiles("CartoDB.Positron") %>%
  addLegend(position = "bottomright", pal = pal2, values = ~sum_diagnosis_rate,
            title = "The amount of HIV diagnosis rate", opacity = 0.8) %>% 
  setView(-73.98, 40.75, zoom = 11)
## Warning in validateCoords(lng, lat, funcName): Data contains 127 rows with
## either missing or invalid lat/lon values and will be ignored

map_data_income

income_by_zipcode = 
  combine_hiv_income %>% 
  filter(year != "2011") %>% 
  filter(gender == "All", age == "All", race == "All") %>% 
  group_by(zipcode, uhf) %>% 
  summarise(mean_income = mean(mid_income))
  
map_data_income = geo_join(nyc_zipcode, income_by_zipcode, "postalCode", "zipcode")

map3

pal3 = colorNumeric(palette = "Greens", domain = range(map_data_income@data$mean_income, na.rm = T))

leaflet(map_data_income) %>%
  addTiles() %>%  
  addPolygons(color = "black", weight = 1,
              fillColor = ~pal3(mean_income), fillOpacity = 0.8, 
              popup = ~stringr::str_c(uhf, "  sum:", factor(mean_income))) %>% 
  addProviderTiles("CartoDB.Positron") %>%
  addLegend(position = "bottomright", pal = pal3, values = ~mean_income,
            title = "Mean income of each uhf", opacity = 0.8) %>% 
  setView(-73.98, 40.75, zoom = 11)